suppressPackageStartupMessages(library(diffcyt))
suppressPackageStartupMessages(library(rrcov))
suppressPackageStartupMessages(library(CATALYST))
suppressPackageStartupMessages(library(ExperimentHub))
suppressPackageStartupMessages(library(moveHMM))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(flowCore))
suppressPackageStartupMessages(library(scater))
suppressPackageStartupMessages(library(SingleCellExperiment))
suppressPackageStartupMessages(library(ExperimentHub))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(UpSetR))
suppressPackageStartupMessages(library(PCAtools))
suppressPackageStartupMessages(library(mclust))
suppressPackageStartupMessages(library(clue))
eh <- ExperimentHub()
## snapshotDate(): 2019-10-22
sce <- eh[["EH1532"]]
## snapshotDate(): 2019-10-22
## see ?DuoClustering2018 and browseVignettes('DuoClustering2018') for documentation
## loading from cache
rownames(sce) <- paste0(rowData(sce)$id, "_", rowData(sce)$symbol)
sce
## class: SingleCellExperiment
## dim: 15716 3994
## metadata(1): log.exprs.offset
## assays(3): counts logcounts normcounts
## rownames(15716): ENSG00000237683_AL627309.1
## ENSG00000228327_RP11-206L10.2 ... ENSG00000215700_PNRC2
## ENSG00000215699_SRSF10
## rowData names(10): id symbol ... total_counts log10_total_counts
## colnames(3994): b.cells1147 b.cells6276 ... naive.t1167 naive.t4926
## colData names(14): dataset barcode ... libsize.drop feature.drop
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
## altExpNames(0):
populations=sce@colData@listData[["phenoid"]]
populations2=as.data.frame(sce@colData@listData[["phenoid"]])
sum(populations[1:3994]=="b.cells")
## [1] 499
sum(populations[1:3994]=="naive.cytotoxic")
## [1] 398
sum(populations[1:3994]=="cd14.monocytes")
## [1] 600
sum(populations[1:3994]=="cd4.t.helper")
## [1] 400
sum(populations[1:3994]=="cd56.nk")
## [1] 600
sum(populations[1:3994]=="memory.t")
## [1] 500
sum(populations[1:3994]=="naive.t")
## [1] 499
total_counts=sce@colData@listData[["total_counts"]]
bcell_depth=sum(total_counts[1:499])
naive_cytotoxic_depth=sum(total_counts[500:897])
cd14_monocytes_depth=sum(total_counts[898:1497])
cd14_t_helper_depth=sum(total_counts[1498:1897])
cd56nk_depth=sum(total_counts[1898:2497])
memory_t_depth=sum(total_counts[2498:2997])
naive_t_depth=sum(total_counts[2998:3969])
bcell_depth
## [1] 737752
naive_cytotoxic_depth
## [1] 617717
cd14_monocytes_depth
## [1] 644180
cd14_t_helper_depth
## [1] 528095
cd56nk_depth
## [1] 837739
memory_t_depth
## [1] 828500
naive_t_depth
## [1] 1393245
(mito <- grep("MT-", rownames(sce), value = TRUE))
## [1] "ENSG00000198888_MT-ND1" "ENSG00000198763_MT-ND2"
## [3] "ENSG00000198804_MT-CO1" "ENSG00000198712_MT-CO2"
## [5] "ENSG00000228253_MT-ATP8" "ENSG00000198899_MT-ATP6"
## [7] "ENSG00000198938_MT-CO3" "ENSG00000198840_MT-ND3"
## [9] "ENSG00000212907_MT-ND4L" "ENSG00000198886_MT-ND4"
## [11] "ENSG00000198786_MT-ND5" "ENSG00000198695_MT-ND6"
## [13] "ENSG00000198727_MT-CYB"
df <- perCellQCMetrics(sce, subsets=list(Mito=mito))
df
## DataFrame with 3994 rows and 10 columns
## sum detected percent_top_50 percent_top_100 percent_top_200
## <numeric> <integer> <numeric> <numeric> <numeric>
## 1 1096 427 48.1751824817518 66.7883211678832 79.2883211678832
## 2 857 340 51.1085180863477 69.0781796966161 83.6639439906651
## 3 1593 602 46.2649089767734 63.0257376020088 74.7645951035782
## 4 743 410 42.7994616419919 57.8734858681023 71.7362045760431
## 5 1031 411 51.0184287099903 67.3132880698351 79.5344325897187
## ... ... ... ... ... ...
## 3990 1259 545 43.6060365369341 59.0945194598888 72.5972994440032
## 3991 1259 504 46.6243050039714 62.6687847498014 75.8538522637014
## 3992 1290 614 41.0077519379845 55.0387596899225 67.906976744186
## 3993 805 365 49.9378881987578 64.9689440993789 79.5031055900621
## 3994 1224 498 46.6503267973856 63.1535947712418 75.6535947712418
## percent_top_500 subsets_Mito_sum subsets_Mito_detected
## <numeric> <numeric> <integer>
## 1 100 21 7
## 2 100 21 8
## 3 93.5969868173258 37 9
## 4 100 61 10
## 5 100 26 9
## ... ... ... ...
## 3990 96.4257347100874 24 8
## 3991 99.6822875297855 16 7
## 3992 91.1627906976744 26 9
## 3993 100 7 4
## 3994 100 23 9
## subsets_Mito_percent total
## <numeric> <numeric>
## 1 1.91605839416058 1096
## 2 2.45040840140023 857
## 3 2.32266164469554 1593
## 4 8.20995962314939 743
## 5 2.52182347235693 1031
## ... ... ...
## 3990 1.90627482128674 1259
## 3991 1.27084988085782 1259
## 3992 2.01550387596899 1290
## 3993 0.869565217391304 805
## 3994 1.87908496732026 1224
discard.mito <- isOutlier(df$subsets_Mito_percent, type="higher")
summary(discard.mito)
## Mode FALSE TRUE
## logical 3908 86
plot(df$sum, df$subsets_Mito_percent, log="x",
xlab="Total count", ylab='Mitochondrial %')
abline(h=attr(discard.mito, "thresholds")["higher"], col="red")
#### below we can see the distribution of number of counts/cell in all the cells of the dataset (we could also see a visual outlier):
plot(df$sum)
#### From this plot, we could remove some outliers, such as - cells containing above 10% of mitochondrial reads & containing less than 5800 counts:
qc.mito <- df$subsets_Mito_percent > 10
qc.lib <- df$sum > 5000
discard <- qc.lib | qc.mito
DataFrame(LibSize=sum(qc.lib), MitoProp=sum(qc.mito), Total=sum(discard))
## DataFrame with 1 row and 3 columns
## LibSize MitoProp Total
## <integer> <integer> <integer>
## 1 15 10 25
filtered <- sce[,!discard]
sce=filtered
library(scran)
##
## Attaching package: 'scran'
## The following object is masked from 'package:PCAtools':
##
## parallelPCA
clusters <- quickCluster(sce)
sce <- computeSumFactors(sce, cluster=clusters)
sce <- logNormCounts(sce)
suppressPackageStartupMessages(library(scran))
dec.pbmc <- modelGeneVar(sce)
fit.pbmc <- metadata(dec.pbmc)
plot(fit.pbmc$mean, fit.pbmc$var, xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(fit.pbmc$trend(x), col="red", add=TRUE, lwd=2)
#### Ordering by most interesting genes for inspection.
dec.pbmc[order(dec.pbmc$bio, decreasing=TRUE),]
## DataFrame with 15716 rows and 6 columns
## mean total tech
## <numeric> <numeric> <numeric>
## ENSG00000019582_CD74 1.26474331256936 2.74437092523629 0.787832656781869
## ENSG00000115523_GNLY 0.63952177892004 2.28385043090429 0.547202802756434
## ENSG00000163220_S100A9 0.498584133120091 1.67913170312111 0.452619436823626
## ENSG00000143546_S100A8 0.492033178926322 1.66864659827605 0.447777955785647
## ENSG00000204287_HLA-DRA 0.711610468263255 1.79553646646598 0.588479474393869
## ... ... ... ...
## ENSG00000173812_EIF1 2.09963419307843 0.631699947538411 0.773803246617403
## ENSG00000161970_RPL26 2.86322949797314 0.536421674046619 0.693664303105048
## ENSG00000204525_HLA-C 2.38783488630792 0.560694785922417 0.752938297735789
## ENSG00000234745_HLA-B 2.99508368910097 0.425588062507862 0.669720843921196
## ENSG00000205542_TMSB4X 4.17218387314075 0.302824073065219 0.554934657032776
## bio p.value
## <numeric> <numeric>
## ENSG00000019582_CD74 1.95653826845442 7.4705166588826e-66
## ENSG00000115523_GNLY 1.73664762814786 3.70397836595812e-106
## ENSG00000163220_S100A9 1.22651226629748 5.41292371142383e-78
## ENSG00000143546_S100A8 1.2208686424904 6.2575549729389e-79
## ENSG00000204287_HLA-DRA 1.20705699207211 1.36957060561464e-45
## ... ... ...
## ENSG00000173812_EIF1 -0.142103299078993 0.896967651074246
## ENSG00000161970_RPL26 -0.157242629058429 0.940716831884149
## ENSG00000204525_HLA-C -0.192243511813372 0.960628027392819
## ENSG00000234745_HLA-B -0.244132781413334 0.993962483886251
## ENSG00000205542_TMSB4X -0.252110583967556 0.999120305859297
## FDR
## <numeric>
## ENSG00000019582_CD74 2.92041172487368e-62
## ENSG00000115523_GNLY 5.79191097084871e-102
## ENSG00000163220_S100A9 2.82139626918448e-74
## ENSG00000143546_S100A8 4.89246935559228e-75
## ENSG00000204287_HLA-DRA 3.56932925999935e-42
## ... ...
## ENSG00000173812_EIF1 0.958772517591632
## ENSG00000161970_RPL26 0.987867858950806
## ENSG00000204525_HLA-C 0.999999305456306
## ENSG00000234745_HLA-B 0.999999305456306
## ENSG00000205542_TMSB4X 0.999999305456306
dec.cv2.pbmc <- modelGeneCV2(sce)
fit.cv2.pbmc <- metadata(dec.cv2.pbmc)
plot(fit.cv2.pbmc$mean, fit.cv2.pbmc$cv2, log="xy")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 79 x values <= 0 omitted from
## logarithmic plot
curve(fit.cv2.pbmc$trend(x), col="red", add=TRUE, lwd=2)
#### The deviation for each gene from the trend:
dec.cv2.pbmc[order(dec.cv2.pbmc$ratio, decreasing=TRUE),]
## DataFrame with 15716 rows and 6 columns
## mean total
## <numeric> <numeric>
## ENSG00000254709_IGLL5 0.2712326359366 80.6430620109659
## ENSG00000143546_S100A8 2.01536951484566 11.425633804434
## ENSG00000163220_S100A9 1.98074332220598 10.1020978104105
## ENSG00000115523_GNLY 2.77958806920241 6.46259972848585
## ENSG00000115568_ZNF142 0.00778548688821122 1805.28752873894
## ... ... ...
## ENSG00000075702_WDR62 0 NaN
## ENSG00000170956_CEACAM3 0 NaN
## ENSG00000269403_CTD-2616J11.11 0 NaN
## ENSG00000129991_TNNI3 0 NaN
## ENSG00000223638_RFPL4A 0 NaN
## trend ratio p.value
## <numeric> <numeric> <numeric>
## ENSG00000254709_IGLL5 5.02761448544464 16.0400249948428 0
## ENSG00000143546_S100A8 0.791850753180504 14.4290243566005 0
## ENSG00000163220_S100A9 0.803365904447868 12.5747156488467 0
## ENSG00000115523_GNLY 0.610745834443516 10.5814880168184 0
## ENSG00000115568_ZNF142 170.647916723026 10.5790188559351 0
## ... ... ... ...
## ENSG00000075702_WDR62 Inf NaN NaN
## ENSG00000170956_CEACAM3 Inf NaN NaN
## ENSG00000269403_CTD-2616J11.11 Inf NaN NaN
## ENSG00000129991_TNNI3 Inf NaN NaN
## ENSG00000223638_RFPL4A Inf NaN NaN
## FDR
## <numeric>
## ENSG00000254709_IGLL5 0
## ENSG00000143546_S100A8 0
## ENSG00000163220_S100A9 0
## ENSG00000115523_GNLY 0
## ENSG00000115568_ZNF142 0
## ... ...
## ENSG00000075702_WDR62 NaN
## ENSG00000170956_CEACAM3 NaN
## ENSG00000269403_CTD-2616J11.11 NaN
## ENSG00000129991_TNNI3 NaN
## ENSG00000223638_RFPL4A NaN
hvg.pbmc.var <- getTopHVGs(dec.pbmc, n=2000)
str(hvg.pbmc.var)
## chr [1:2000] "ENSG00000019582_CD74" "ENSG00000115523_GNLY" ...
way1=list(str(hvg.pbmc.var))
## chr [1:2000] "ENSG00000019582_CD74" "ENSG00000115523_GNLY" ...
hvg.pbmc.var.2 <- getTopHVGs(dec.pbmc, fdr.threshold = 0.05)
way2=list(hvg.pbmc.var.2)
suppressPackageStartupMessages(library(UpSetR))
way1=as.data.frame(hvg.pbmc.var)
way2=as.data.frame(hvg.pbmc.var.2)
suppressPackageStartupMessages(library(scater))
sce=runPCA(sce,subset_row=hvg.pbmc.var)
percent.var <- attr(reducedDim(sce), "percentVar")
chosen.elbow <- PCAtools::findElbowPoint(percent.var)
chosen.elbow
## [1] 6
plot(percent.var, xlab="PC", ylab="Variance explained (%)")
abline(v=chosen.elbow, col="red")
reducedDim(sce, "PCA") <- reducedDim(sce, "PCA")[,1:10]
ncol(reducedDim(sce, "PCA"))
## [1] 10
reducedDim(sce, "PCA_10") <- reducedDim(sce, "PCA")[,1:10]
reducedDimNames(sce)
## [1] "PCA" "TSNE" "PCA_10"
plotReducedDim(sce, dimred="PCA",colour_by = "phenoid")
sce<- runUMAP(sce, dimred="PCA")
reducedDimNames(sce)
## [1] "PCA" "TSNE" "PCA_10" "UMAP"
dim(reducedDim(sce, "UMAP"))
## [1] 3969 2
plotReducedDim(sce, dimred="UMAP",colour_by = "phenoid")
plotReducedDim(sce, dimred="UMAP",colour_by = "log10_total_features")
plotReducedDim(sce, dimred="UMAP",colour_by = "log10_total_counts")
suppressPackageStartupMessages(library(scran))
g <- buildSNNGraph(sce, k=18, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7
## 578 516 945 588 502 411 429
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
plotReducedDim(sce, "PCA", colour_by="cluster")
#### Second of the 2 algorithms for clustering - K-means custering:
sce1=sce
clust.kmeans <- kmeans(reducedDim(sce, "PCA"), centers=7)
table(clust.kmeans$cluster)
##
## 1 2 3 4 5 6 7
## 719 576 291 587 690 504 602
sce1$cluster <- factor(clust.kmeans$cluster)
plotReducedDim(sce1, "TSNE", colour_by="cluster")
#### Calculating the adjusted Rand Index of the clusters generated by PCA & k-means:
adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
## [1] 0.7410227
suppressPackageStartupMessages(library(scran))
g <- buildSNNGraph(sce, k=2, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 45 31 264 122 292 356 124 30 421 159 372 360 134 152 245 151 47 63 29 45
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 74 33 15 17 7 13 17 17 16 31 12 8 6 13 8 9 16 8 18 12
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 6 8 6 5 6 9 10 4 4 6 5 5 7 6 4 3 4 3 3 5
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 3 3 6 5 3 5 7 5 4 4 7 6 4 3 3
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k2value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k2clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=3, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 545 138 441 576 417 42 416 481 463 302 80 34 5 6 16 7
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k3value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k3clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=4, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 556 330 147 377 627 339 477 475 425 72 84 31 7 6 16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k4value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k4clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=5, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9 10
## 561 587 859 441 424 422 428 168 63 16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k5value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k5clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=6, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 498 432 432 327 446 553 433 185 115 373 89 70 16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k6value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k6clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=7, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9
## 503 561 587 880 429 435 410 148 16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k7value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k7clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=8, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9
## 502 562 587 918 397 445 425 117 16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k8value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k8clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=9, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8
## 542 502 562 919 587 435 406 16
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k9value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k9clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=12, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9
## 558 501 856 588 421 21 429 424 171
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k12value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k12clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=13, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9
## 558 501 837 589 476 440 21 418 129
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k13value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k13clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=15, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6 7 8 9
## 835 558 502 588 454 429 184 399 20
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k15value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k15clusters=nrow(unique(as.data.frame(clust)))
g <- buildSNNGraph(sce, k=20, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
## clust
## 1 2 3 4 5 6
## 1040 578 818 589 502 442
suppressPackageStartupMessages(library(scater))
sce$cluster <- factor(clust)
k20value=adjustedRandIndex(sce$cluster,clust.kmeans$cluster)
k20clusters=nrow(unique(as.data.frame(clust)))
values=data.frame(k2value,k3value,k4value,k5value,k6value,k7value,k8value,k9value,k12value,k13value,k15value,k20value)
clusters=data.frame(k2clusters,k3clusters,k4clusters,k5clusters,k6clusters,k7clusters,k8clusters,k9clusters,k12clusters,k13clusters,k15clusters,k20clusters)
values=as.matrix(values)
clusters=as.matrix(clusters)
plot(clusters,values)